This file is used to analyse the immune cells dataset.
library(dplyr)
library(patchwork)
library(ggplot2)
library(ComplexHeatmap)
library(org.Mm.eg.db)
.libPaths()
## [1] "/usr/local/lib/R/library"
In this section, we set the global settings of the analysis. We will store data there :
save_name = "matrix"
out_dir = "."
We load the dataset :
sobj = readRDS(paste0(out_dir, "/", save_name, "_sobj.rds"))
sobj
## An object of class Seurat
## 15937 features across 1529 samples within 1 assay
## Active assay: RNA (15937 features, 2000 variable features)
## 6 dimensional reductions calculated: RNA_pca, RNA_pca_19_tsne, RNA_pca_19_umap, harmony, harmony_19_umap, harmony_19_tsne
We load the sample information :
sample_info = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_sample_info.rds"))
project_names_oi = sample_info$project_name
graphics::pie(rep(1, nrow(sample_info)),
col = sample_info$color,
labels = sample_info$project_name)
Here are custom colors for each cell type :
color_markers = readRDS(paste0(out_dir, "/../../1_metadata/hs_hd_color_markers.rds"))
color_markers = color_markers[unique(sobj$cluster_type)]
data.frame(cell_type = names(color_markers),
color = unlist(color_markers)) %>%
ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
ggplot2::geom_point(pch = 21, size = 5) +
ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
ggplot2::theme_classic() +
ggplot2::theme(legend.position = "none",
axis.line = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(angle = 30, hjust = 1))
This is the projection of interest :
name2D = "harmony_19_tsne"
We design a custom functions to represent cells of interest on the projection :
see_clusters = function(pop_oi = "IRS") {
# Clusters containing population of interest
clusters_oi = as.character(unique(sobj$seurat_clusters[sobj$cluster_type == pop_oi]))
# Colors for clusters
custom_colors = aquarius::gg_color_hue(length(levels(sobj$seurat_clusters)))
names(custom_colors) = levels(sobj$seurat_clusters)
custom_colors[!(names(custom_colors) %in% clusters_oi)] = "gray92"
p1 = Seurat::DimPlot(sobj, reduction = name2D, cols = custom_colors,
group.by = "seurat_clusters", label = TRUE) +
ggplot2::labs(title = "Clusters of interest",
subtitle = paste0(clusters_oi, collapse = ", ")) +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
Seurat::NoAxes() + Seurat::NoLegend()
# Color for cell type
custom_colors = unlist(color_markers)
custom_colors[!(names(custom_colors) %in% pop_oi)] = "gray92"
p2 = Seurat::DimPlot(sobj, reduction = name2D,
group.by = "cluster_type", cols = custom_colors) +
ggplot2::labs(title = "Annotation of interest",
subtitle = pop_oi) +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
Seurat::NoAxes() + Seurat::NoLegend()
# Piechart for each cluster, by sample name
plot_list = lapply(clusters_oi, FUN = function(one_cluster) {
df = sobj@meta.data %>%
dplyr::filter(.data$seurat_clusters == .env$one_cluster) %>%
dplyr::select(sample_identifier, seurat_clusters)
p = aquarius::plot_piechart(df,
grouping_var = "sample_identifier",
colors = sample_info$color) +
ggplot2::labs(title = paste0("Cluster ", one_cluster),
subtitle = paste0(nrow(df), " cells")) +
ggplot2::theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
return(p)
})
p3 = patchwork::wrap_plots(plot_list)
# Patchwork
p = patchwork::wrap_plots(p2, p1, p3, nrow = 1)
return(p)
}
We design a custom function to make the GSEA plot and a word cloud graph :
make_gsea_plot = function(gsea_results, gs_oi, fold_change, metric = "FC") {
fold_change$metric = fold_change[, metric]
plot_list = lapply(gs_oi, FUN = function(gene_set) {
# Gene set content
gs_content = gene_sets %>%
dplyr::filter(gs_name == gene_set) %>%
dplyr::pull(ensembl_gene) %>%
unique()
# Gene set size
nb_genes = length(gs_content)
# Enrichment metrics
NES = gsea_results@result[gene_set, "NES"]
p.adjust = gsea_results@result[gene_set, "p.adjust"] %>%
round(., 4)
qvalues = gsea_results@result[gene_set, "qvalues"]
if (p.adjust > 0.05) {
p.adjust = paste0("<span style='color:red;'>", p.adjust, "</span>")
}
my_subtitle = paste0("\nNES : ", round(NES, 2),
" | padj : ", p.adjust,
" | qval : ", round(qvalues, 4),
" | set size : ", nb_genes, " genes")
# Size limits
lower_FC = min(fold_change[gs_content, ]$metric, na.rm = TRUE)
upper_FC = max(fold_change[gs_content, ]$metric, na.rm = TRUE)
# Plot
p = enrichplot::gseaplot2(x = gsea_results, geneSetID = gene_set) +
ggplot2::labs(title = gene_set,
subtitle = my_subtitle) +
ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold",
margin = ggplot2::margin(3, 3, 5, 3)),
plot.subtitle = ggtext::element_markdown(hjust = 0.5,
size = 10))
wc = ggplot2::ggplot(fold_change[gs_content, ],
aes(label = gene_name, size = abs(metric), color = metric)) +
ggwordcloud::geom_text_wordcloud_area(show.legend = TRUE) +
ggplot2::scale_color_gradient2(
name = metric,
low = aquarius::color_cnv[1],
mid = "gray70", midpoint = 0,
high = aquarius::color_cnv[3]) +
ggplot2::scale_size_area(max_size = 7) +
ggplot2::theme_minimal() +
ggplot2::guides(size = "none")
return(list(p, wc))
}) %>% unlist(., recursive = FALSE)
return(plot_list)
}
We visualize gene expression for some markers :
features = c("percent.mt", "percent.rb", "nFeature_RNA")
plot_list = lapply(features, FUN = function(one_gene) {
Seurat::FeaturePlot(sobj, features = one_gene,
reduction = name2D) +
ggplot2::theme(aspect.ratio = 1) +
ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
Seurat::NoAxes()
})
patchwork::wrap_plots(plot_list, ncol = 3)
We visualize clusters :
cluster_plot = Seurat::DimPlot(sobj, reduction = name2D, label = TRUE) +
Seurat::NoAxes() +
ggplot2::theme(aspect.ratio = 1)
cluster_plot
We visualize cell type split by sample :
plot_list = aquarius::plot_split_dimred(sobj,
reduction = name2D,
split_by = "project_name",
group_by = "cell_type",
split_color = setNames(sample_info$color,
nm = sample_info$project_name),
group_color = color_markers,
bg_pt_size = 0.5, main_pt_size = 0.5)
plot_list[[length(plot_list) + 1]] = cluster_plot
patchwork::wrap_plots(plot_list, ncol = 4) &
Seurat::NoLegend()
We compare cluster annotation and cell type annotation :
p1 = Seurat::DimPlot(sobj, group.by = "cell_type",
reduction = name2D, cols = color_markers) +
ggplot2::labs(title = "Cell type") +
Seurat::NoAxes() +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5))
p2 = Seurat::DimPlot(sobj, group.by = "cluster_type",
reduction = name2D, cols = color_markers) +
ggplot2::labs(title = "Cluster type") +
Seurat::NoAxes() +
ggplot2::theme(aspect.ratio = 1,
plot.title = element_text(hjust = 0.5))
patchwork::wrap_plots(p1, p2, guides = "collect")
For each population, what are the differences between healthy donors (HD) and HS patients (HS) ? We save the results in a list :
list_results = list()
We make over-representation analysis for each group of genes. We load gene sets from MSigDB :
gene_sets = aquarius::get_gene_sets(species = "Homo sapiens")
gene_sets = gene_sets$gene_sets
head(gene_sets)
## # A tibble: 6 x 16
## gs_cat gs_subcat gs_name gene_symbol entrez_gene ensembl_gene human_gene_symb~
## <chr> <chr> <chr> <chr> <int> <chr> <chr>
## 1 C5 GO:BP GOBP_1~ AASDHPPT 60496 ENSG0000014~ AASDHPPT
## 2 C5 GO:BP GOBP_1~ ALDH1L1 10840 ENSG0000014~ ALDH1L1
## 3 C5 GO:BP GOBP_1~ ALDH1L2 160428 ENSG0000013~ ALDH1L2
## 4 C5 GO:BP GOBP_1~ MTHFD1 4522 ENSG0000010~ MTHFD1
## 5 C5 GO:BP GOBP_1~ MTHFD1L 25902 ENSG0000012~ MTHFD1L
## 6 C5 GO:BP GOBP_1~ MTHFD2L 441024 ENSG0000016~ MTHFD2L
## # ... with 9 more variables: human_entrez_gene <int>, human_ensembl_gene <chr>,
## # gs_id <chr>, gs_pmid <chr>, gs_geoid <chr>, gs_exact_source <chr>,
## # gs_url <chr>, gs_description <chr>, category <chr>
How many gene sets ?
gene_sets[, c("gs_subcat", "gs_name")] %>%
dplyr::distinct() %>%
dplyr::pull(gs_subcat) %>%
table() %>%
as.data.frame.table() %>%
`colnames<-`(c("Category", "Nb gene sets"))
## Category Nb gene sets
## 1 50
## 2 CP:KEGG 186
## 3 CP:PID 196
## 4 CP:REACTOME 1615
## 5 CP:WIKIPATHWAYS 664
## 6 GO:BP 7658
## 7 GO:CC 1006
## 8 GO:MF 1738
We get gene name and gene ID correspondence :
gene_corresp = sobj@assays[["RNA"]]@meta.features[, c("gene_name", "Ensembl_ID")] %>%
`colnames<-`(c("NAME", "ID")) %>%
dplyr::mutate(ID = as.character(ID))
rownames(gene_corresp) = gene_corresp$ID
head(gene_corresp)
## NAME ID
## ENSG00000238009 AL627309.1 ENSG00000238009
## ENSG00000237491 AL669831.5 ENSG00000237491
## ENSG00000225880 LINC00115 ENSG00000225880
## ENSG00000230368 FAM41C ENSG00000230368
## ENSG00000230699 AL645608.3 ENSG00000230699
## ENSG00000241180 AL645608.5 ENSG00000241180
group_name = "IRS"
IRS are those clusters :
clusters_oi = as.character(unique(sobj$seurat_clusters[sobj$cluster_type == group_name]))
clusters_oi
## [1] "10" "5"
We represent those clusters on the projection :
see_clusters("IRS")
We perform differential expression between HS and HD, within this population :
subsobj = subset(sobj, seurat_clusters %in% clusters_oi)
Seurat::Idents(subsobj) = subsobj$sample_type
table(subsobj$sample_type)
##
## HS HD
## 81 67
We make identify specific markers for these group :
mark = Seurat::FindMarkers(subsobj, ident.1 = "HS", ident.2 = "HD")
mark = mark %>%
dplyr::filter(p_val_adj < 0.05) %>%
dplyr::arrange(-avg_logFC, pct.1 - pct.2)
list_results[[group_name]]$mark = mark
dim(mark)
## [1] 74 5
head(mark, n = 20)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## LGALS7B 3.081625e-09 1.5356124 0.938 0.955 4.911186e-05
## LGALS7 4.722986e-11 1.3241540 0.716 0.373 7.527022e-07
## MIF 3.749898e-11 1.0433933 0.914 0.821 5.976212e-07
## MTRNR2L8 2.959636e-13 0.8968284 1.000 0.970 4.716771e-09
## RPS26 7.161254e-15 0.7478794 1.000 1.000 1.141289e-10
## MTRNR2L10 2.790024e-11 0.7066885 0.790 0.507 4.446462e-07
## MTRNR2L12 1.733980e-10 0.6659660 1.000 0.985 2.763443e-06
## ARF5 2.072975e-12 0.6173838 0.654 0.134 3.303701e-08
## EGR1 1.022240e-08 0.5249253 0.914 0.552 1.629144e-04
## KLK7 1.442664e-07 0.4507268 0.728 0.373 2.299173e-03
## SERPINB5 4.901550e-08 0.4480291 0.975 0.985 7.811600e-04
## NBPF14 1.834105e-06 0.4201037 0.716 0.448 2.923013e-02
## S100A11 2.238558e-06 0.3506447 1.000 0.970 3.567590e-02
## MT-CO1 1.886033e-06 0.3498513 1.000 0.985 3.005771e-02
## COMT 1.746678e-06 0.3464609 0.975 0.970 2.783681e-02
## DYNLL1 1.239554e-09 0.3383409 1.000 1.000 1.975478e-05
## PPP4C 9.455785e-07 0.3234715 0.926 0.866 1.506968e-02
## NDUFB3 1.526526e-06 0.2529496 0.988 0.940 2.432825e-02
## OST4 7.834985e-10 0.2521368 0.988 1.000 1.248662e-05
## RPS28 1.035927e-06 0.2517537 1.000 1.000 1.650958e-02
What are the genes up-regulated in HS ?
genes_of_interest = mark %>%
dplyr::filter(avg_logFC > 0) %>%
rownames()
enrichr_results = aquarius::run_enrichr(gene_names = genes_of_interest,
gene_corresp = gene_corresp,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
make_plot = TRUE,
plot_title = "Down-regulated in HS compared to HD")
list_results[[group_name]]$enrichr_hs = enrichr_results$ego
enrichr_results$plot +
ggplot2::theme(axis.text.y = element_text(size = 8))
What are the genes up-regulated in HD ?
genes_of_interest = mark %>%
dplyr::filter(avg_logFC < 0) %>%
rownames()
enrichr_results = aquarius::run_enrichr(gene_names = genes_of_interest,
gene_corresp = gene_corresp,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
make_plot = TRUE,
plot_title = "Down-regulated in HS compared to HD")
list_results[[group_name]]$enrichr_hd = enrichr_results$ego
enrichr_results$plot +
ggplot2::theme(axis.text.y = element_text(size = 8))
We run a GSEA for all gene sets, from the full count matrix :
ranked_gene_list = aquarius::run_foldchange(Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts"),
group1 = colnames(subsobj)[subsobj@active.ident %in% "HS"],
group2 = colnames(subsobj)[subsobj@active.ident %in% "HD"])
names(ranked_gene_list) = gene_corresp$ID
gsea_results = aquarius::gsea_run(ranked_gene_list = ranked_gene_list,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
GSEA_p_val_thresh = 1)
list_results[[group_name]]$gsea = gsea_results
gsea_results@result %>%
dplyr::filter(pvalue < 0.05) %>%
dplyr::top_n(., n = 200, wt = abs(NES)) %>%
dplyr::mutate(too_long = ifelse(nchar(ID) > 60, yes = TRUE, no = FALSE)) %>%
dplyr::mutate(ID = stringr::str_sub(ID, end = 60)) %>%
dplyr::mutate(ID = ifelse(too_long, yes = paste0(ID, "..."), no = ID)) %>%
aquarius::gsea_plot(show_legend = TRUE) +
ggplot2::labs(title = "GSEA using all genes (count matrix)") +
ggplot2::theme(plot.title = element_text(size = 20))
We compute a fold change table to make a wordcloud :
fold_change = gene_corresp
colnames(fold_change) = c("gene_name", "ID")
rownames(fold_change) = fold_change$ID
fold_change$FC = ranked_gene_list[rownames(fold_change)]
fold_change$pct.1 = Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts")[, subsobj@active.ident == "HS"] %>%
apply(., MARGIN = 1, FUN = function(one_row) {
return( mean(one_row != 0) )
})
fold_change$pct.2 = Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts")[, subsobj@active.ident == "HD"] %>%
apply(., MARGIN = 1, FUN = function(one_row) {
return( mean(one_row != 0) )
})
fold_change$FC_x_pct = ifelse(fold_change$FC > 0,
yes = fold_change$FC * fold_change$pct.1,
no = fold_change$FC * fold_change$pct.2)
dim(fold_change) ; head(fold_change)
## [1] 15937 6
## gene_name ID FC pct.1 pct.2
## ENSG00000238009 AL627309.1 ENSG00000238009 -0.51845506 0.01234568 0.01492537
## ENSG00000237491 AL669831.5 ENSG00000237491 -0.10341756 0.08641975 0.07462687
## ENSG00000225880 LINC00115 ENSG00000225880 0.13362164 0.11111111 0.07462687
## ENSG00000230368 FAM41C ENSG00000230368 -0.10341756 0.03703704 0.02985075
## ENSG00000230699 AL645608.3 ENSG00000230699 -0.19652696 0.09876543 0.10447761
## ENSG00000241180 AL645608.5 ENSG00000241180 0.06650744 0.02469136 0.01492537
## FC_x_pct
## ENSG00000238009 -0.007738135
## ENSG00000237491 -0.007717728
## ENSG00000225880 0.014846849
## ENSG00000230368 -0.003087091
## ENSG00000230699 -0.020532668
## ENSG00000241180 0.001642159
We make the gsea plot for some gene sets :
gs_oi = c("GOBP_KERATINIZATION",
"GOBP_CELL_AGGREGATION",
"GOCC_CORNIFIED_ENVELOPE",
"GOBP_MESENCHYMAL_CELL_DIFFERENTIATION",
"WP_DNA_REPAIR_PATHWAYS_FULL_NETWORK",
"HALLMARK_G2M_CHECKPOINT",
"HALLMARK_E2F_TARGETS")
plot_list = make_gsea_plot(gsea_results, gs_oi, fold_change, "FC_x_pct")
patchwork::wrap_plots(plot_list, ncol = 2, widths = c(2,1.5))
group_name = "medulla"
Medulla are those clusters :
clusters_oi = as.character(unique(sobj$seurat_clusters[sobj$cluster_type == group_name]))
clusters_oi
## [1] "3" "6" "9"
We represent those clusters on the projection :
see_clusters("medulla")
We perform differential expression between HS and HD, within this population :
subsobj = subset(sobj, seurat_clusters %in% c(2))
Seurat::Idents(subsobj) = subsobj$sample_type
table(subsobj$sample_type)
##
## HS HD
## 115 62
We make identify specific markers for these group :
mark = Seurat::FindMarkers(subsobj, ident.1 = "HS", ident.2 = "HD")
mark = mark %>%
dplyr::filter(p_val_adj < 0.05) %>%
dplyr::arrange(-avg_logFC, pct.1 - pct.2)
list_results[[group_name]]$mark = mark
dim(mark)
## [1] 50 5
head(mark, n = 20)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## MT4 6.700532e-11 2.3676425 0.852 0.710 1.067864e-06
## LGALS7 1.050898e-11 1.1734249 0.704 0.290 1.674816e-07
## MIF 7.055503e-11 0.9122395 0.826 0.726 1.124435e-06
## MTRNR2L8 1.932382e-14 0.8160014 1.000 0.984 3.079637e-10
## RPS26 2.698863e-14 0.7474198 1.000 1.000 4.301177e-10
## MTRNR2L10 3.527689e-11 0.6690467 0.791 0.468 5.622078e-07
## ARF5 3.299404e-14 0.6154629 0.704 0.129 5.258260e-10
## C1QTNF12 3.570827e-13 0.4589135 1.000 0.887 5.690826e-09
## MTRNR2L12 5.978296e-12 0.4540986 1.000 1.000 9.527610e-08
## NSMCE3 8.961305e-07 0.4219172 0.957 0.871 1.428163e-02
## TOMM5 4.228664e-10 0.4077463 0.678 0.258 6.739222e-06
## EGR1 1.164623e-06 0.4037074 0.991 0.919 1.856059e-02
## GNG11 2.682451e-06 0.3639743 0.722 0.387 4.275023e-02
## MT-CO1 2.889633e-06 0.3516409 1.000 1.000 4.605208e-02
## DYNLL1 2.648089e-11 0.3459339 1.000 0.984 4.220260e-07
## SMS 1.370304e-06 0.3010088 0.983 0.935 2.183854e-02
## NDUFB3 3.024908e-08 0.3004245 0.948 0.935 4.820796e-04
## AKIRIN2 8.224625e-07 0.2962770 0.835 0.661 1.310759e-02
## SRI 1.392941e-06 0.2870494 0.948 0.790 2.219930e-02
## COMT 1.086327e-06 0.2848493 1.000 0.952 1.731280e-02
We explore enrichment in gene sets for HS population.
genes_of_interest = mark %>%
dplyr::filter(avg_logFC > 0) %>%
rownames()
enrichr_results = aquarius::run_enrichr(gene_names = genes_of_interest,
gene_corresp = gene_corresp,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
make_plot = TRUE,
plot_title = "Up-regulated in HS compared to HD")
list_results[[group_name]]$enrichr_hs = enrichr_results$ego
enrichr_results$plot +
ggplot2::theme(axis.text.y = element_text(size = 8))
We explore enrichment in gene sets for HD population.
genes_of_interest = mark %>%
dplyr::filter(avg_logFC < 0) %>%
rownames()
enrichr_results = aquarius::run_enrichr(gene_names = genes_of_interest,
gene_corresp = gene_corresp,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
make_plot = TRUE,
plot_title = "Down-regulated in HS compared to HD")
list_results[[group_name]]$enrichr_hd = enrichr_results$ego
enrichr_results$plot +
ggplot2::theme(axis.text.y = element_text(size = 8))
We run a GSEA for all gene sets, from the full count matrix :
ranked_gene_list = aquarius::run_foldchange(Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts"),
group1 = colnames(subsobj)[subsobj@active.ident %in% "HS"],
group2 = colnames(subsobj)[subsobj@active.ident %in% "HD"])
names(ranked_gene_list) = gene_corresp$ID
gsea_results = aquarius::gsea_run(ranked_gene_list = ranked_gene_list,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
GSEA_p_val_thresh = 1)
list_results[[group_name]]$gsea = gsea_results
gsea_results@result %>%
dplyr::filter(pvalue < 0.05) %>%
dplyr::top_n(., n = 200, wt = abs(NES)) %>%
dplyr::mutate(too_long = ifelse(nchar(ID) > 70, yes = TRUE, no = FALSE)) %>%
dplyr::mutate(ID = stringr::str_sub(ID, end = 70)) %>%
dplyr::mutate(ID = ifelse(too_long, yes = paste0(ID, "..."), no = ID)) %>%
aquarius::gsea_plot(show_legend = TRUE) +
ggplot2::labs(title = "GSEA using all genes (count matrix)") +
ggplot2::theme(plot.title = element_text(size = 20))
We compute a fold change table to make a wordcloud :
fold_change = gene_corresp
colnames(fold_change) = c("gene_name", "ID")
rownames(fold_change) = fold_change$ID
fold_change$FC = ranked_gene_list[rownames(fold_change)]
fold_change$pct.1 = Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts")[, subsobj@active.ident == "HS"] %>%
apply(., MARGIN = 1, FUN = function(one_row) {
return( mean(one_row != 0) )
})
fold_change$pct.2 = Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts")[, subsobj@active.ident == "HD"] %>%
apply(., MARGIN = 1, FUN = function(one_row) {
return( mean(one_row != 0) )
})
fold_change$FC_x_pct = ifelse(fold_change$FC > 0,
yes = fold_change$FC * fold_change$pct.1,
no = fold_change$FC * fold_change$pct.2)
dim(fold_change) ; head(fold_change)
## [1] 15937 6
## gene_name ID FC pct.1 pct.2
## ENSG00000238009 AL627309.1 ENSG00000238009 -0.4808066 0.01739130 0.01612903
## ENSG00000237491 AL669831.5 ENSG00000237491 -0.1146787 0.21739130 0.16129032
## ENSG00000225880 LINC00115 ENSG00000225880 -0.3653294 0.09565217 0.11290323
## ENSG00000230368 FAM41C ENSG00000230368 1.8411215 0.12173913 0.01612903
## ENSG00000230699 AL645608.3 ENSG00000230699 0.5191934 0.03478261 0.01612903
## ENSG00000241180 AL645608.5 ENSG00000241180 0.5191934 0.01739130 0.00000000
## FC_x_pct
## ENSG00000238009 -0.007754945
## ENSG00000237491 -0.018496562
## ENSG00000225880 -0.041246864
## ENSG00000230368 0.224136532
## ENSG00000230699 0.018058901
## ENSG00000241180 0.009029451
We make the gsea plot for some gene sets :
gs_oi = c("GOMF_RAGE_RECEPTOR_BINDING",
"GOCC_GAMMA_SECRETASE_COMPLEX",
"REACTOME_NOTCH3_ACTIVATION_AND_TRANSMISSION_OF_SIGNAL_TO_THE_NUCLEUS",
"REACTOME_NONCANONICAL_ACTIVATION_OF_NOTCH3",
"GOBP_POSITIVE_REGULATION_OF_CHEMOKINE_C_X_C_MOTIF_LIGAND_2_PRODUCTION",
"GOBP_POSITIVE_REGULATION_OF_HISTONE_H3_K27_METHYLATION",
"GOMF_TRANSFORMING_GROWTH_FACTOR_BETA_BINDING",
"GOMF_STRUCTURAL_MOLECULE_ACTIVITY_CONFERRING_ELASTICITY",
"GOBP_REGULATION_OF_MESENCHYMAL_STEM_CELL_DIFFERENTIATION")
plot_list = make_gsea_plot(gsea_results, gs_oi, fold_change, "FC_x_pct")
patchwork::wrap_plots(plot_list, ncol = 2, widths = c(2,1.5))
group_name = "cortex"
Cortex are those clusters :
clusters_oi = as.character(unique(sobj$seurat_clusters[sobj$cluster_type == group_name]))
clusters_oi
## [1] "12" "0"
We represent those clusters on the projection :
see_clusters("cortex")
We perform differential expression between HS and HD, within this population :
subsobj = subset(sobj, seurat_clusters %in% c(0))
Seurat::Idents(subsobj) = subsobj$sample_type
table(subsobj$sample_type)
##
## HS HD
## 194 97
We make identify specific markers for these group :
mark = Seurat::FindMarkers(subsobj, ident.1 = "HS", ident.2 = "HD")
mark = mark %>%
dplyr::filter(p_val_adj < 0.05) %>%
dplyr::arrange(-avg_logFC, pct.1 - pct.2)
list_results[[group_name]]$mark = mark
dim(mark)
## [1] 90 5
head(mark, n = 20)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## MT4 7.272553e-19 2.5463410 0.856 0.753 1.159027e-14
## LGALS7 9.747132e-23 1.6620818 0.794 0.443 1.553400e-18
## LGALS7B 4.257287e-09 1.3687629 0.928 0.990 6.784838e-05
## MIF 6.101755e-23 1.0505880 0.887 0.784 9.724366e-19
## MTRNR2L8 1.324205e-22 1.0245593 0.933 0.907 2.110385e-18
## KRTAP11-1 7.357915e-10 0.8439837 0.933 0.887 1.172631e-05
## MTRNR2L12 2.169970e-19 0.8137391 0.948 0.928 3.458281e-15
## C10orf99 5.443558e-16 0.7734478 0.742 0.320 8.675398e-12
## TOMM5 2.903241e-25 0.6669403 0.784 0.196 4.626895e-21
## MTRNR2L10 9.299557e-17 0.6493191 0.794 0.526 1.482070e-12
## ARF5 1.110846e-23 0.6491460 0.742 0.155 1.770356e-19
## PRR9 8.804044e-09 0.6337644 1.000 0.990 1.403100e-04
## DAPL1 9.255770e-10 0.5990359 1.000 0.979 1.475092e-05
## RPS26 2.297916e-24 0.5508076 1.000 1.000 3.662189e-20
## MT-CO1 7.910833e-09 0.4960001 0.959 0.938 1.260750e-04
## ALOX15B 3.152614e-10 0.4921725 0.799 0.464 5.024321e-06
## PLCG2 2.052006e-07 0.4913897 0.541 0.227 3.270281e-03
## SERPINB5 3.267773e-12 0.4373238 0.979 0.938 5.207849e-08
## GNG11 6.952996e-08 0.4213139 0.649 0.361 1.108099e-03
## CSTB 2.342789e-13 0.3914171 0.990 1.000 3.733703e-09
We explore enrichment in gene sets for HS population.
genes_of_interest = mark %>%
dplyr::filter(avg_logFC > 0) %>%
rownames()
enrichr_results = aquarius::run_enrichr(gene_names = genes_of_interest,
gene_corresp = gene_corresp,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
make_plot = TRUE,
plot_title = "Up-regulated in HS compared to HD")
list_results[[group_name]]$enrichr_hs = enrichr_results$ego
enrichr_results$plot +
ggplot2::theme(axis.text.y = element_text(size = 8))
We explore enrichment in gene sets for HD population.
genes_of_interest = mark %>%
dplyr::filter(avg_logFC < 0) %>%
rownames()
enrichr_results = aquarius::run_enrichr(gene_names = genes_of_interest,
gene_corresp = gene_corresp,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
make_plot = TRUE,
plot_title = "Down-regulated in HS compared to HD")
list_results[[group_name]]$enrichr_hd = enrichr_results$ego
enrichr_results$plot +
ggplot2::theme(axis.text.y = element_text(size = 8))
We run a GSEA for all gene sets, from the full count matrix :
ranked_gene_list = aquarius::run_foldchange(Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts"),
group1 = colnames(subsobj)[subsobj@active.ident %in% "HS"],
group2 = colnames(subsobj)[subsobj@active.ident %in% "HD"])
names(ranked_gene_list) = gene_corresp$ID
gsea_results = aquarius::gsea_run(ranked_gene_list = ranked_gene_list,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
GSEA_p_val_thresh = 1)
list_results[[group_name]]$gsea = gsea_results
gsea_results@result %>%
dplyr::filter(pvalue < 0.05) %>%
dplyr::top_n(., n = 200, wt = abs(NES)) %>%
dplyr::mutate(too_long = ifelse(nchar(ID) > 60, yes = TRUE, no = FALSE)) %>%
dplyr::mutate(ID = stringr::str_sub(ID, end = 60)) %>%
dplyr::mutate(ID = ifelse(too_long, yes = paste0(ID, "..."), no = ID)) %>%
aquarius::gsea_plot(show_legend = TRUE) +
ggplot2::labs(title = "GSEA using all genes (count matrix)") +
ggplot2::theme(plot.title = element_text(size = 20))
We compute a fold change table to make a wordcloud :
fold_change = gene_corresp
colnames(fold_change) = c("gene_name", "ID")
rownames(fold_change) = fold_change$ID
fold_change$FC = ranked_gene_list[rownames(fold_change)]
fold_change$pct.1 = Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts")[, subsobj@active.ident == "HS"] %>%
apply(., MARGIN = 1, FUN = function(one_row) {
return( mean(one_row != 0) )
})
fold_change$pct.2 = Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts")[, subsobj@active.ident == "HD"] %>%
apply(., MARGIN = 1, FUN = function(one_row) {
return( mean(one_row != 0) )
})
fold_change$FC_x_pct = ifelse(fold_change$FC > 0,
yes = fold_change$FC * fold_change$pct.1,
no = fold_change$FC * fold_change$pct.2)
dim(fold_change) ; head(fold_change)
## [1] 15937 6
## gene_name ID FC pct.1 pct.2
## ENSG00000238009 AL627309.1 ENSG00000238009 0.8497322 0.015463918 0.00000000
## ENSG00000237491 AL669831.5 ENSG00000237491 0.8193585 0.201030928 0.10309278
## ENSG00000225880 LINC00115 ENSG00000225880 -0.4721959 0.067010309 0.09278351
## ENSG00000230368 FAM41C ENSG00000230368 1.0976597 0.092783505 0.03092784
## ENSG00000230699 AL645608.3 ENSG00000230699 -0.7352303 0.010309278 0.02061856
## ENSG00000241180 AL645608.5 ENSG00000241180 -1.1502678 0.005154639 0.01030928
## FC_x_pct
## ENSG00000238009 0.01314019
## ENSG00000237491 0.16471640
## ENSG00000225880 -0.04381199
## ENSG00000230368 0.10184471
## ENSG00000230699 -0.01515939
## ENSG00000241180 -0.01185843
We make the gsea plot for some gene sets :
gs_oi = c("GOCC_KERATIN_FILAMENT",
"GOBP_INTERMEDIATE_FILAMENT_ORGANIZATION",
"WP_CYTOKINES_AND_INFLAMMATORY_RESPONSE",
"GOBP_POSITIVE_REGULATION_OF_HETEROTYPIC_CELL_CELL_ADHESION",
"GOBP_RESPONSE_TO_FUNGUS",
"REACTOME_VOLTAGE_GATED_POTASSIUM_CHANNELS")
plot_list = make_gsea_plot(gsea_results, gs_oi, fold_change, "FC_x_pct")
patchwork::wrap_plots(plot_list, ncol = 2, widths = c(2,1.5))
group_name = "cuticle"
Cuticle are those clusters :
clusters_oi = as.character(unique(sobj$seurat_clusters[sobj$cluster_type == group_name]))
clusters_oi
## [1] "4" "2" "8" "7" "1" "11"
We represent those clusters on the projection :
see_clusters("cuticle")
We perform differential expression between HS and HD, within this population :
subsobj = subset(sobj, seurat_clusters %in% c(1))
Seurat::Idents(subsobj) = subsobj$sample_type
table(subsobj$sample_type)
##
## HS HD
## 125 82
We make identify specific markers for these group :
mark = Seurat::FindMarkers(subsobj, ident.1 = "HS", ident.2 = "HD")
mark = mark %>%
dplyr::filter(p_val_adj < 0.05) %>%
dplyr::arrange(-avg_logFC, pct.1 - pct.2)
list_results[[group_name]]$mark = mark
dim(mark)
## [1] 40 5
head(mark, n = 20)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## MT4 1.110113e-08 2.3978461 0.800 0.720 1.769187e-04
## MIF 8.625579e-11 1.0407902 0.792 0.683 1.374659e-06
## LGALS7 5.257608e-11 0.9610409 0.608 0.232 8.379049e-07
## MTRNR2L8 1.233258e-15 0.8365386 0.992 1.000 1.965444e-11
## MTRNR2L12 3.056034e-19 0.7248124 1.000 1.000 4.870402e-15
## RPS26 7.512204e-19 0.6936653 1.000 0.988 1.197220e-14
## MTRNR2L10 1.874930e-10 0.6633563 0.720 0.488 2.988076e-06
## NSMCE3 3.165578e-15 0.6109144 0.960 0.805 5.044981e-11
## KRT25 1.052574e-06 0.5459221 0.784 0.537 1.677487e-02
## S100A10 4.917155e-08 0.4664376 0.992 0.963 7.836470e-04
## TOMM5 1.346169e-10 0.4352497 0.576 0.171 2.145389e-06
## ARF5 1.629853e-08 0.4252033 0.520 0.183 2.597497e-04
## NDUFB3 1.020655e-10 0.3667876 0.952 0.841 1.626617e-06
## C1QTNF12 1.196426e-08 0.3353051 0.976 0.902 1.906744e-04
## C9orf3 2.660042e-09 0.3281363 0.984 0.976 4.239308e-05
## GNG11 1.073711e-06 0.3205391 0.456 0.159 1.711174e-02
## PPP4C 4.056694e-10 0.3193076 0.824 0.549 6.465153e-06
## CSTB 9.455915e-09 0.3179388 0.968 0.976 1.506989e-04
## S100A11 1.609715e-06 0.3148957 0.992 0.963 2.565403e-02
## GSTO1 5.938944e-07 0.2849684 0.968 0.915 9.464895e-03
We explore enrichment in gene sets for HS population.
genes_of_interest = mark %>%
dplyr::filter(avg_logFC > 0) %>%
rownames()
enrichr_results = aquarius::run_enrichr(gene_names = genes_of_interest,
gene_corresp = gene_corresp,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
make_plot = TRUE,
plot_title = "Up-regulated in HS compared to HD")
list_results[[group_name]]$enrichr_hs = enrichr_results$ego
enrichr_results$plot +
ggplot2::theme(axis.text.y = element_text(size = 8))
We explore enrichment in gene sets for HD population.
genes_of_interest = mark %>%
dplyr::filter(avg_logFC < 0) %>%
rownames()
enrichr_results = aquarius::run_enrichr(gene_names = genes_of_interest,
gene_corresp = gene_corresp,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
make_plot = TRUE,
plot_title = "Down-regulated in HS compared to HD")
list_results[[group_name]]$enrichr_hd = enrichr_results$ego
enrichr_results$plot +
ggplot2::theme(axis.text.y = element_text(size = 8))
We run a GSEA for all gene sets, from the full count matrix :
ranked_gene_list = aquarius::run_foldchange(Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts"),
group1 = colnames(subsobj)[subsobj@active.ident %in% "HS"],
group2 = colnames(subsobj)[subsobj@active.ident %in% "HD"])
names(ranked_gene_list) = gene_corresp$ID
gsea_results = aquarius::gsea_run(ranked_gene_list = ranked_gene_list,
gene_sets = gene_sets[, c("gs_name", "ensembl_gene")],
GSEA_p_val_thresh = 1)
list_results[[group_name]]$gsea = gsea_results
gsea_results@result %>%
dplyr::filter(pvalue < 0.05) %>%
dplyr::top_n(., n = 200, wt = abs(NES)) %>%
dplyr::mutate(too_long = ifelse(nchar(ID) > 60, yes = TRUE, no = FALSE)) %>%
dplyr::mutate(ID = stringr::str_sub(ID, end = 60)) %>%
dplyr::mutate(ID = ifelse(too_long, yes = paste0(ID, "..."), no = ID)) %>%
aquarius::gsea_plot(show_legend = TRUE) +
ggplot2::labs(title = "GSEA using all genes (count matrix)") +
ggplot2::theme(plot.title = element_text(size = 20))
We compute a fold change table to make a wordcloud :
fold_change = gene_corresp
colnames(fold_change) = c("gene_name", "ID")
rownames(fold_change) = fold_change$ID
fold_change$FC = ranked_gene_list[rownames(fold_change)]
fold_change$pct.1 = Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts")[, subsobj@active.ident == "HS"] %>%
apply(., MARGIN = 1, FUN = function(one_row) {
return( mean(one_row != 0) )
})
fold_change$pct.2 = Seurat::GetAssayData(subsobj, assay = "RNA", slot = "counts")[, subsobj@active.ident == "HD"] %>%
apply(., MARGIN = 1, FUN = function(one_row) {
return( mean(one_row != 0) )
})
fold_change$FC_x_pct = ifelse(fold_change$FC > 0,
yes = fold_change$FC * fold_change$pct.1,
no = fold_change$FC * fold_change$pct.2)
dim(fold_change) ; head(fold_change)
## [1] 15937 6
## gene_name ID FC pct.1 pct.2
## ENSG00000238009 AL627309.1 ENSG00000238009 1.9954329 0.040 0.00000000
## ENSG00000237491 AL669831.5 ENSG00000237491 0.8510430 0.216 0.12195122
## ENSG00000225880 LINC00115 ENSG00000225880 -0.7415326 0.048 0.09756098
## ENSG00000230368 FAM41C ENSG00000230368 1.1474360 0.056 0.02439024
## ENSG00000230699 AL645608.3 ENSG00000230699 -1.0045671 0.016 0.03658537
## ENSG00000241180 AL645608.5 ENSG00000241180 -0.5895296 0.000 0.00000000
## FC_x_pct
## ENSG00000238009 0.07981732
## ENSG00000237491 0.18382530
## ENSG00000225880 -0.07234465
## ENSG00000230368 0.06425642
## ENSG00000230699 -0.03675245
## ENSG00000241180 0.00000000
We make the gsea plot for some gene sets :
gs_oi = c("GOMF_T_CELL_RECEPTOR_BINDING",
"HALLMARK_PEROXISOME",
"GOBP_TOLL_LIKE_RECEPTOR_3_SIGNALING_PATHWAY")
plot_list = make_gsea_plot(gsea_results, gs_oi, fold_change, "FC_x_pct")
patchwork::wrap_plots(plot_list, ncol = 2, widths = c(2,1.5))
We extract all DE genes :
features_oi = lapply(list_results, `[[`, 1) %>%
lapply(., FUN = rownames) %>%
unlist() %>% unname() %>% unique()
length(features_oi)
## [1] 138
We prepare the scaled expression matrix :
mat_expression = Seurat::GetAssayData(sobj, assay = "RNA", slot = "data")[features_oi, ]
mat_expression = Matrix::t(mat_expression)
mat_expression = dynutils::scale_quantile(mat_expression) # between 0 and 1
mat_expression = Matrix::t(mat_expression)
mat_expression = as.matrix(mat_expression) # not sparse
dim(mat_expression)
## [1] 138 1529
We prepare the heatmap annotation :
ha_top = ComplexHeatmap::HeatmapAnnotation(
cell_type = sobj$cluster_type,
sample_type = sobj$sample_type,
cluster = sobj$seurat_clusters,
col = list(cell_type = color_markers,
sample_type = setNames(nm = c("HS", "HD"),
c("#C55F40", "#2C78E6")),
cluster = setNames(nm = levels(sobj$seurat_clusters),
aquarius::gg_color_hue(length(levels(sobj$seurat_clusters))))))
And the heatmap :
sobj$cell_group = paste0(sobj$cluster_type, sobj$sample_type) %>%
as.factor()
ht = ComplexHeatmap::Heatmap(mat_expression,
col = aquarius::color_cnv,
# Annotation
top_annotation = ha_top,
# Grouping
column_order = sobj@meta.data %>%
dplyr::arrange(cluster_type, sample_type, seurat_clusters) %>%
rownames(),
column_split = sobj$cell_group,
column_gap = rep(unit(c(0.01, 2), "mm"), 4),
column_title = NULL,
cluster_rows = FALSE,
cluster_columns = FALSE,
show_column_names = FALSE,
# Visual aspect
show_heatmap_legend = TRUE,
border = TRUE)
ComplexHeatmap::draw(ht,
merge_legend = TRUE,
heatmap_legend_side = "bottom",
annotation_legend_side = "bottom")
plot_list = lapply(sort(features_oi), FUN = function(one_gene) {
p = Seurat::VlnPlot(sobj, group.by = "cluster_type", split.by = "sample_type",
features = one_gene) +
ggplot2::scale_fill_manual(breaks = c("HS", "HD"),
values = c("#C55F40", "#2C78E6")) +
ggplot2::theme(plot.subtitle = element_text(hjust = 0.5),
axis.title.x = element_blank(),
legend.position = "none")
return(p)
})
names(plot_list) = sort(features_oi)
patchwork::wrap_plots(plot_list, ncol = 6)